{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys,random\n",
    "import pickle\n",
    "import os\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import modin.pandas as pd\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('featurescore'):\n",
    "    os.mkdir('featurescore')\n",
    "if not os.path.exists('model'):\n",
    "    os.mkdir('model')\n",
    "if not os.path.exists('preds'):\n",
    "    os.mkdir('preds')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 58.3 s, sys: 2.81 s, total: 1min 1s\n",
      "Wall time: 1min 9s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])\n",
    "data_raw = pd.read_csv('../../preprocess_data_new/train_ax_nodup.csv',nrows=33465).drop(columns=['id','loan_dt','tag'])\n",
    "# data_raw = pd.read_csv('../../preprocess_data/train_x_33465.csv').drop(columns=['id','loan_dt','tag'])\n",
    "# data_null = pd.read_csv('../../preprocess_data/train_x_null.csv').drop(columns=['id'])\n",
    "data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])\n",
    "data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data_raw_nan = data_raw.dropna(axis=1,how='all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.56 s, sys: 208 ms, total: 2.76 s\n",
      "Wall time: 2.76 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data = pd.concat([data_date,data_raw,data_null,data_tag],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n",
      "Wall time: 17.1 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33465, 6702)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用旧数据集 train集的维度为：  \n",
    "使用新数据集 train集的维度为：6702"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MIC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from minepy import MINE\n",
    "mic = MINE()\n",
    "mic_result = pd.Series()\n",
    "x = data.fillna(-1)\n",
    "for col in data.columns:\n",
    "    mic_result[col] = mic.compute_score(x[col].values.ravel(), data_label.label.values.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mic_result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 本地验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split # 分割数据模块\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn import metrics\n",
    "\n",
    "#1.分割数据\n",
    "\n",
    "# x的挑选\n",
    "feature_names = list(data.columns)\n",
    "x= data[feature_names].fillna(-1).values\n",
    "y=data_label.values.ravel()\n",
    "X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=3096)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10040, 6702)"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始训练！\n",
      "      Iter       Train Loss   Remaining Time \n",
      "         1           0.4727           61.35m\n",
      "         2           0.4628           60.42m\n",
      "         3           0.4555           60.67m\n",
      "         4           0.4488           60.33m\n",
      "         5           0.4424           60.32m\n",
      "         6           0.4373           60.37m\n",
      "         7           0.4324           60.06m\n",
      "         8           0.4281           59.58m\n",
      "         9           0.4241           59.37m\n",
      "        10           0.4206           58.96m\n",
      "        20           0.3949           56.20m\n",
      "        30           0.3785           53.83m\n",
      "        40           0.3650           51.42m\n",
      "        50           0.3547           48.97m\n",
      "        60           0.3465           46.43m\n",
      "        70           0.3385           44.18m\n",
      "        80           0.3327           41.66m\n",
      "        90           0.3288           39.19m\n",
      "       100           0.3240           37.05m\n",
      "       200           0.2850           17.83m\n",
      "       300           0.2569            0.00s\n",
      "CPU times: user 52min 40s, sys: 592 ms, total: 52min 40s\n",
      "Wall time: 52min 40s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#2.建立模型 \n",
    "# from sklearn.svm import LinearSVC\n",
    "# lsvm = LinearSVC(random_state=2018, max_iter =1000)\n",
    "\n",
    "# from sklearn.ensemble import RandomForestClassifier\n",
    "# rfc  =RandomForestClassifier(n_estimators=2500,n_jobs =36,max_features='sqrt',class_weight='balanced',verbose =1,random_state=2018)\n",
    "\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "gbc = GradientBoostingClassifier(loss='deviance',learning_rate =0.1,n_estimators=500,subsample=0.9,max_depth=7,verbose=1,random_state=2018)\n",
    "\n",
    "#3.训练模型\n",
    "print('开始训练！')\n",
    "# lsvm.fit(X_train,y_train)\n",
    "# rfc.fit(X_train,y_train)\n",
    "gbc.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始预测！\n",
      "AUC: 0.8293387542510773\n"
     ]
    }
   ],
   "source": [
    "#4.预测结果\n",
    "print('开始预测！')\n",
    "# y_pre = lsvm.decision_function(X_test)\n",
    "# y_pre = rfc.predict_proba(X_test)[:,1]\n",
    "y_pre = gbc.predict_proba(X_test)[:,1]\n",
    "auc = metrics.roc_auc_score(y_test, y_pre)\n",
    "print('AUC:',auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10040,)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pre.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "raw AUC: 0.8130921791239684  \n",
    "raw 去除nan列 AUC: 0.8125236195981279  \n",
    "raw 去除nan列 + 统计null AUC: 0.8263001992035767   \n",
    "nodup + null AUC: 0.8261527677046496  \n",
    "nodup + null + tag AUC:0.8296696198580618  \n",
    "nodup + null + tag (前4000维)AUC: 0.8251877022906782    \n",
    "nodup + null + tag (前5990维) AUC: 0.821991126741565  \n",
    "nodup + null + tag (前3000维) AUC: 0.8275583356303323  \n",
    "nodup + null + tag (前2000维) AUC: 0.8249837418081847"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "nodup + null + tag (PCA降至8维) AUC: 0.5830606257872517  \n",
    "nodup + null + tag (fillna(-1))AUC: 0.8313812914152184  \n",
    "nodup + null + tag（添加1维lda特征）AUC: 0.7196871063741885"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "xgboost  \n",
    "linearn-svm AUC: 0.5075515879920058  线性不可分  \n",
    "rfc AUC: 0.804115098223589  \n",
    "gbc AUC: 0.8293387542510773  \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_score  = model.get_fscore()\n",
    "feature_rank = sorted(feature_score,key=lambda x:x[1],reverse=True)\n",
    "import pickle\n",
    "pickle.dump(feature_rank,open('./cv_feat_rank.pkl','wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pickle\n",
    "# feature_rank = pickle.load(open('./cv_feat_rank.pkl','rb'))\n",
    "# feature_rank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import gc\n",
    "del dtrain,dtest,model\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练与预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 27.8 s, sys: 1.86 s, total: 29.6 s\n",
      "Wall time: 29.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "valid_date = pd.read_csv('../../preprocess_data/valid_date.csv').drop(columns=['id','loan_hour'])\n",
    "# valid_raw = pd.read_csv('../../preprocess_data/valid.csv').drop(columns=['id','loan_dt'])\n",
    "valid_raw = pd.read_csv('../../preprocess_data_new/valid_nodup.csv').drop(columns=['id','loan_dt'])\n",
    "valid_null = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])\n",
    "valid_tag = pd.read_csv('../predict_tag/tag.csv',usecols=['tag'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.46 s, sys: 0 ns, total: 1.46 s\n",
      "Wall time: 1.46 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "valid = pd.concat([valid_date,valid_raw,valid_null,valid_tag],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = data.fillna(-1).values\n",
    "test = valid.fillna(-1).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_id = pd.read_csv('../../preprocess_data_new/valid_date.csv',usecols=['id']).values.ravel()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    5.6s\n",
      "[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:   14.5s\n",
      "[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:   27.4s\n",
      "[Parallel(n_jobs=36)]: Done 1178 tasks      | elapsed:   43.4s\n",
      "[Parallel(n_jobs=36)]: Done 1728 tasks      | elapsed:  1.0min\n",
      "[Parallel(n_jobs=36)]: Done 2378 tasks      | elapsed:  1.4min\n",
      "[Parallel(n_jobs=36)]: Done 2500 out of 2500 | elapsed:  1.5min finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight='balanced',\n",
       "            criterion='gini', max_depth=None, max_features='sqrt',\n",
       "            max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "            min_impurity_split=None, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            n_estimators=2500, n_jobs=36, oob_score=False,\n",
       "            random_state=2018, verbose=1, warm_start=False)"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "rfc  =RandomForestClassifier(n_estimators=2500,n_jobs =36,max_features='sqrt',class_weight='balanced',verbose =1,random_state=2018)\n",
    "\n",
    "rfc.fit(train,data_label.values.ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.2s\n",
      "[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.5s\n",
      "[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    0.7s\n",
      "[Parallel(n_jobs=36)]: Done 1178 tasks      | elapsed:    1.0s\n",
      "[Parallel(n_jobs=36)]: Done 1728 tasks      | elapsed:    1.4s\n",
      "[Parallel(n_jobs=36)]: Done 2378 tasks      | elapsed:    1.8s\n",
      "[Parallel(n_jobs=36)]: Done 2500 out of 2500 | elapsed:    1.9s finished\n"
     ]
    }
   ],
   "source": [
    "import joblib\n",
    "joblib.dump(rfc,'./model/rfc.model')\n",
    "\n",
    "prob = rfc.predict_proba(test)[:,1]\n",
    "pred = pd.DataFrame({'id':test_id, 'prob':prob})\n",
    "pred.to_csv('./rfc_pred.csv',index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用Gradient Boosting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      Iter       Train Loss      OOB Improve   Remaining Time \n",
      "         1           0.4505           0.0129          288.73m\n",
      "         2           0.4237           0.0083          297.08m\n",
      "         3           0.4106           0.0078          296.12m\n",
      "         4           0.3975           0.0049          298.18m\n",
      "         5           0.3826           0.0059          299.02m\n",
      "         6           0.3710           0.0042          299.36m\n",
      "         7           0.3645           0.0032          294.56m\n",
      "         8           0.3484           0.0028          294.55m\n",
      "         9           0.3448           0.0022          288.63m\n",
      "        10           0.3380           0.0030          288.74m\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "gbc = GradientBoostingClassifier(loss='deviance',learning_rate =0.1,n_estimators=300,subsample=0.9,max_depth=7,verbose=1,random_state=2018)\n",
    "gbc.fit(train,data_label.values.ravel())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "joblib.dump(gbc,'./model/gbc.model')\n",
    "\n",
    "prob = gbc.predict_proba(test)[:,1]\n",
    "pred = pd.DataFrame({'id':test_id, 'prob':prob})\n",
    "pred.to_csv('./gbc_pred1.csv',index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "raw 去除nan列 + 统计null AUC: 0.82751113123698  \n",
    "nodup + null + tag AUC: 0.82803008109167  \n",
    "nodup + null + tag（rank融合）AUC:0.8279914450872  \n",
    "nodup + null + tag (fillna(-1)) AUC:0.82979480823375  \n",
    "使用GradientBoostingClassifier AUC:0.81958272831703"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
